Out of 30 samples, we selected 18 for this study. These are the normal tissue samples form the control, the UVA and the UVA+SFN treatment groups. normal tissue samples from the UVB_UA groups as well as tumor samples were excluded from this analysis.
First, we removed 7,148 genes with zero counts in > 80% (> 14 out of 18) of samples. 17,273 out of 24,421 genes left.
[1] 7148
[1] 17273
Next, we noramized the counts. To convert number of hits to the relative abundane of genes in each sample, we used transcripts per kilobase million (TPM) normalization, which is as following for the j-th sample:
1. normilize for gene length: a[i, j] = 1,000*count[i, j]/gene[i, j] length(bp)
2. normalize for seq depth (i.e. total count): a(i, j)/sum(a[, j])
3. multiply by one million
A very good comparison of normalization techniques can be found at the following video:
RPKM, FPKM and TPM, clearly explained
After the normalization, each sample’s total is 1M:
02w_CON_0 02w_CON_1 02w_SFN_0 02w_SFN_1 02w_UVB_0 02w_UVB_1 15w_CON_0 15w_CON_1 15w_SFN_0 15w_SFN_1 15w_UVB_0
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
15w_UVB_1 25w_CON_0 25w_CON_1 25w_SFN_0 25w_SFN_1 25w_UVB_0 25w_UVB_1
1e+06 1e+06 1e+06 1e+06 1e+06 1e+06 1e+06
# Separate top 100 abundant genes
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[(nrow(tpm) - 99):nrow(tpm)]])
tmp <- melt.data.table(data = tmp,
id.vars = 1:2,
measure.vars = 3:ncol(tmp),
variable.name = "Sample",
value.name = "TPM")
tmp$Week <- substr(x = tmp$Sample,
start = 1,
stop = 3)
tmp$Week <- factor(tmp$Week,
levels = unique(tmp$Week))
tmp$Treatment <- substr(x = tmp$Sample,
start = 5,
stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
levels = c("CON",
"UVB",
"SFN"))
tmp$Replica <- substr(x = tmp$Sample,
start = 9,
stop = 9)
tmp$Replica <- factor(tmp$Replica,
levels = 0:1)
# Plot top 100 abundant genes
p2 <- ggplot(tmp,
aes(x = TPM,
y = Geneid,
fill = Treatment,
shape = Week)) +
# facet_wrap(~ Sex, nrow = 1) +
geom_point(size = 3,
alpha = 0.5) +
geom_vline(xintercept = 1,
linetype = "dashed")
ggplotly(p2)
tmp <- droplevels(tpm[Geneid %in% levels(tpm$Geneid)[1:100]])
tmp <- melt.data.table(data = tmp,
id.vars = 1:2,
measure.vars = 3:ncol(tmp),
variable.name = "Sample",
value.name = "TPM")
tmp$Week <- substr(x = tmp$Sample,
start = 1,
stop = 3)
tmp$Week <- factor(tmp$Week,
levels = unique(tmp$Week))
tmp$Treatment <- substr(x = tmp$Sample,
start = 5,
stop = 7)
tmp$Treatment <- factor(tmp$Treatment,
levels = c("CON",
"UVB",
"SFN"))
tmp$Replica <- substr(x = tmp$Sample,
start = 9,
stop = 9)
tmp$Replica <- factor(tmp$Replica,
levels = 0:1)
# Plot top 100 abundant genes
p3 <- ggplot(tmp,
aes(x = TPM,
y = Geneid,
fill = Treatment,
shape = Week)) +
# facet_wrap(~ Sex, nrow = 1) +
geom_point(size = 3,
alpha = 0.5) +
geom_vline(xintercept = 1,
linetype = "dashed")
ggplotly(p3)
dmeta <- data.table(Sample = colnames(dt1)[-c(1:2)])
dmeta$time <- substr(x = dmeta$Sample,
start = 1,
stop = 3)
dmeta$time <- factor(dmeta$time,
levels = c("02w",
"15w",
"25w"))
dmeta$Week <- factor(dmeta$time,
levels = c("02w",
"15w",
"25w"),
labels = c("Week 2",
"Week 15",
"Week 25"))
dmeta$trt <- substr(x = dmeta$Sample,
start = 5,
stop = 7)
dmeta$trt <- factor(dmeta$trt,
levels = c("CON",
"UVB",
"SFN"))
dmeta$Treatment <- factor(dmeta$trt,
levels = c("CON",
"UVB",
"SFN"),
labels = c("Negative Control",
"Positive Control (UVB)",
"Sulforaphane (SFN)"))
dmeta$Replica <- substr(x = dmeta$Sample,
start = 9,
stop = 9)
dmeta$Replica <- factor(dmeta$Replica,
levels = 0:1)
datatable(dmeta,
options = list(pageLength = nrow(dmeta)))
NOTE: the distributions are skewed. To make them symmetric, log transformation is often applied. However, there is an issue of zeros. In this instance, we added a small values lambda[i] equal to 1/10 of the smallest non-zero value of i-th gene.
dm.tpm <- as.matrix(tpm[, -c(1:2), with = FALSE])
rownames(dm.tpm) <- tpm$Geneid
# Add lambdas to all values, then take a log
dm.ltpm <- t(apply(X = dm.tpm,
MARGIN = 1,
FUN = function(a) {
lambda <- min(a[a > 0])/10
log(a + lambda)
}))
# PCA----
m1 <- prcomp(t(dm.ltpm),
center = TRUE,
scale. = TRUE)
summary(m1)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
Standard deviation 70.7928 56.9107 50.8898 28.84564 26.51968 24.81005 23.85276 22.63644 20.97344 20.20442
Proportion of Variance 0.2901 0.1875 0.1499 0.04817 0.04072 0.03564 0.03294 0.02967 0.02547 0.02363
Cumulative Proportion 0.2901 0.4777 0.6276 0.67575 0.71647 0.75211 0.78504 0.81471 0.84018 0.86381
PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
Standard deviation 19.24099 19.01279 18.73783 18.53642 17.87923 17.65132 17.16891 2.134e-13
Proportion of Variance 0.02143 0.02093 0.02033 0.01989 0.01851 0.01804 0.01707 0.000e+00
Cumulative Proportion 0.88524 0.90617 0.92650 0.94639 0.96490 0.98293 1.00000 1.000e+00
plot(m1)
# Biplot while keep only the most important variables (Javier)----
# Select PC-s to pliot (PC1 & PC2)
choices <- c(1:3)
# Scores, i.e. points (df.u)
dt.scr <- data.table(m1$x[, choices])
# Add grouping variables
dt.scr$trt <- dmeta$trt
dt.scr$time <- dmeta$time
dt.scr$sample <- dmeta$Sample
# Loadings, i.e. arrows (df.v)
dt.rot <- as.data.frame(m1$rotation[, choices])
dt.rot$feat <- rownames(dt.rot)
dt.rot <- data.table(dt.rot)
# Axis labels
u.axis.labs <- paste(colnames(dt.rot)[choices],
sprintf('(%0.1f%% explained var.)',
100*m1$sdev[choices]^2/sum(m1$sdev^2)))
p1 <- ggplot(data = dt.scr,
aes(x = PC1,
y = PC2,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[2])
ggplotly(p1)
p1 <- ggplot(data = dt.scr,
aes(x = PC1,
y = PC3,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[1]) +
scale_y_continuous(u.axis.labs[3])
ggplotly(p1)
p1 <- ggplot(data = dt.scr,
aes(x = PC2,
y = PC3,
color = trt,
shape = time)) +
geom_point(size = 4,
alpha = 0.5) +
scale_x_continuous(u.axis.labs[2]) +
scale_y_continuous(u.axis.labs[3])
ggplotly(p1)
scatterplot3js(x = dt.scr$PC1,
y = dt.scr$PC2,
z = dt.scr$PC3,
color = as.numeric(dt.scr$trt),
renderer = "auto",
pch = dt.scr$sample,
size = 0.1)
dtm <- as.matrix(dt1[, -c(1:2), with = FALSE])
rownames(dtm) <- dt1$Geneid
dds <- DESeqDataSetFromMatrix(countData = dtm,
colData = dmeta,
~ trt + time)
# If all samples contain zeros, geometric means cannot be
# estimated. Change default 'type = "ratio"' to 'type = "poscounts"'.
# Type '?DESeq2::estimateSizeFactors' for more details.
dds <- estimateSizeFactors(dds,
type = "poscounts")
dds
class: DESeqDataSet
dim: 17273 18
metadata(1): version
assays(1): counts
rownames(17273): Xkr4 Mrpl15 ... Zf12 Erdr1
rowData names(0):
colnames(18): 02w_CON_0 02w_CON_1 ... 25w_UVB_0 25w_UVB_1
colData names(7): Sample time ... Replica sizeFactor
# # Set cores for parallel processing of DESeq----
# snowparam <- SnowParam(workers = snowWorkers(),
# type = "SOCK")
# register(snowparam,
# default = TRUE)
# Run DESeq----
dds <- DESeq(dds,
fitType = "local",
parallel = TRUE)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates: 2 workers
'package:stats' may not be available when loading'package:stats' may not be available when loadingmean-dispersion relationship
final dispersion estimates, fitting model and testing: 2 workers
'package:stats' may not be available when loading'package:stats' may not be available when loading
results(dds)
log2 fold change (MLE): time 25w vs 02w
Wald test p-value: time 25w vs 02w
DataFrame with 17273 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
Xkr4 0.372637436040936 0.347674453012715 1.85290358759446 0.187637638213051 0.851160719227371
Mrpl15 473.968965072559 -0.12450153923719 0.0714282207122831 -1.74303010764736 0.0813283558381948
Lypla1 1265.59579157719 -0.523495263510286 0.0738820243170689 -7.08555657955006 1.38486574082199e-12
Tcea1 347.786701718151 -0.266798692902443 0.0877732341254237 -3.03963612097511 0.00236864137641766
Rgs20 393.200729771858 -0.465618636782497 0.112301880629506 -4.14613392200096 3.38136101125416e-05
... ... ... ... ... ...
Vamp7 463.037463939562 -0.397578416634149 0.106731090646671 -3.72504782088582 0.000195278199520461
Spry3 4.03688536326271 -1.37585526169675 0.714812221938873 -1.9247785914528 0.0542570634088351
Tmlhe 41.145292290645 -0.469773091455135 0.222960586656882 -2.10697818165538 0.0351194708601783
Zf12 0.216082295697271 0.0868343182457816 2.02714964711862 0.0428356724276411 0.965832527590861
Erdr1 19.5763157162845 0.268086319281388 0.59510270258956 0.45048748411799 0.652358976804483
padj
<numeric>
Xkr4 NA
Mrpl15 0.173491055505417
Lypla1 3.51250323704544e-10
Tcea1 0.0109771852968187
Rgs20 0.00035467560890265
... ...
Vamp7 0.00144782110421568
Spry3 0.127293298490708
Tmlhe 0.0916707944323755
Zf12 NA
Erdr1 0.767483982679305
resultsNames(dds)
[1] "Intercept" "trt_UVB_vs_CON" "trt_SFN_vs_CON" "time_15w_vs_02w" "time_25w_vs_02w"
colData(dds)
DataFrame with 18 rows and 7 columns
Sample time Week trt Treatment Replica sizeFactor
<character> <factor> <factor> <factor> <factor> <factor> <numeric>
02w_CON_0 02w_CON_0 02w Week 2 CON Negative Control 0 0.948964359312974
02w_CON_1 02w_CON_1 02w Week 2 CON Negative Control 1 0.434767245904826
02w_SFN_0 02w_SFN_0 02w Week 2 SFN Sulforaphane (SFN) 0 0.837004805606802
02w_SFN_1 02w_SFN_1 02w Week 2 SFN Sulforaphane (SFN) 1 0.937831868319037
02w_UVB_0 02w_UVB_0 02w Week 2 UVB Positive Control (UVB) 0 0.949001244715886
... ... ... ... ... ... ... ...
25w_CON_1 25w_CON_1 25w Week 25 CON Negative Control 1 1.14536599296614
25w_SFN_0 25w_SFN_0 25w Week 25 SFN Sulforaphane (SFN) 0 1.03002696492621
25w_SFN_1 25w_SFN_1 25w Week 25 SFN Sulforaphane (SFN) 1 1.07022971587466
25w_UVB_0 25w_UVB_0 25w Week 25 UVB Positive Control (UVB) 0 1.21285056237317
25w_UVB_1 25w_UVB_1 25w Week 25 UVB Positive Control (UVB) 1 1.01927326280773
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] threejs_0.3.1 igraph_1.2.4.1 plotly_4.9.0 ggplot2_3.1.1
[5] readxl_1.3.1 DESeq2_1.24.0 SummarizedExperiment_1.14.0 DelayedArray_0.10.0
[9] BiocParallel_1.17.18 matrixStats_0.54.0 Biobase_2.44.0 GenomicRanges_1.36.0
[13] GenomeInfoDb_1.20.0 IRanges_2.18.1 S4Vectors_0.22.0 BiocGenerics_0.30.0
[17] DT_0.6 data.table_1.12.2 knitr_1.22
loaded via a namespace (and not attached):
[1] httr_1.4.0 tidyr_0.8.3 viridisLite_0.3.0 jsonlite_1.6
[5] bit64_0.9-7 splines_3.6.0 shiny_1.3.2 Formula_1.2-3
[9] assertthat_0.2.1 latticeExtra_0.6-28 blob_1.1.1 GenomeInfoDbData_1.2.1
[13] cellranger_1.1.0 yaml_2.2.0 pillar_1.4.1 RSQLite_2.1.1
[17] backports_1.1.4 lattice_0.20-38 glue_1.3.1 digest_0.6.19
[21] promises_1.0.1 RColorBrewer_1.1-2 XVector_0.24.0 checkmate_1.9.3
[25] colorspace_1.4-1 httpuv_1.5.1 htmltools_0.3.6 Matrix_1.2-17
[29] plyr_1.8.4 XML_3.98-1.19 pkgconfig_2.0.2 genefilter_1.66.0
[33] zlibbioc_1.30.0 purrr_0.3.2 xtable_1.8-4 snow_0.4-3
[37] scales_1.0.0 later_0.8.0 htmlTable_1.13.1 tibble_2.1.2
[41] annotate_1.62.0 withr_2.1.2 nnet_7.3-12 lazyeval_0.2.2
[45] mime_0.6 survival_2.44-1.1 magrittr_1.5 crayon_1.3.4
[49] memoise_1.1.0 foreign_0.8-71 tools_3.6.0 stringr_1.4.0
[53] munsell_0.5.0 locfit_1.5-9.1 cluster_2.0.9 AnnotationDbi_1.46.0
[57] compiler_3.6.0 rlang_0.3.4 grid_3.6.0 RCurl_1.95-4.12
[61] rstudioapi_0.10 htmlwidgets_1.3 crosstalk_1.0.0 labeling_0.3
[65] bitops_1.0-6 base64enc_0.1-3 gtable_0.3.0 DBI_1.0.0
[69] R6_2.4.0 gridExtra_2.3 dplyr_0.8.1 bit_1.1-14
[73] Hmisc_4.2-0 stringi_1.4.3 Rcpp_1.0.1 geneplotter_1.62.0
[77] rpart_4.1-15 acepack_1.4.1 tidyselect_0.2.5 xfun_0.7